DRAFT

1 Introduction

1.1 Using the Hmisc library

getHdata(pbc)

# Have upData move units from labels to separate attribute
pbc <- upData(pbc,
              fu.yrs = fu.days / 365.25,
              labels = c(fu.yrs = 'Follow-up Time',
                         status = 'Death or Liver Transplantation'),
              units = c(fu.yrs = 'year'),
              drop  = 'fu.days',
              moveUnits=TRUE, html=TRUE)
Input object size:   80952 bytes;    19 variables    418 observations
 Label for bili changed from Serum Bilirubin (mg/dl) to Serum Bilirubin 
    units set to mg/dl 
 Label for albumin changed from Albumin (gm/dl) to Albumin 
    units set to gm/dl 
 Label for protime changed from Prothrombin Time (sec.) to Prothrombin Time 
    units set to sec. 
 Label for alk.phos changed from Alkaline Phosphatase (U/liter) to Alkaline Phosphatase 
    units set to U/liter 
 Label for sgot changed from SGOT (U/ml) to SGOT 
    units set to U/ml 
 Label for chol changed from Cholesterol (mg/dl) to Cholesterol 
    units set to mg/dl 
 Label for trig changed from Triglycerides (mg/dl) to Triglycerides 
    units set to mg/dl 
 Label for platelet changed from Platelets (per cm^3/1000) to Platelets 
    units set to per cm^3/1000 
 Label for copper changed from Urine Copper (ug/day) to Urine Copper 
    units set to ug/day 
 Added variable     fu.yrs
 Dropped variable   fu.days
 New object size:   84744 bytes;    19 variables    418 observations
# The following can also be done by running this command
# to put the results in a new browser tab:
# getHdata(pbc, 'contents')
html(contents(pbc), maxlevels=10, levelType='table')

Data frame:pbc

418 observations and 19 variables, maximum # NAs:136  
NameLabelsUnitsLevelsStorageNAs
biliSerum Bilirubinmg/dldouble 0
albuminAlbumingm/dldouble 0
stageHistologic Stage, Ludwig Criteriainteger 6
protimeProthrombin Timesec.double 2
sex2integer 0
ageAgedouble 0
spiders2integer106
hepatom2integer106
ascites2integer106
alk.phosAlkaline PhosphataseU/literdouble106
sgotSGOTU/mldouble106
cholCholesterolmg/dlinteger134
trigTriglyceridesmg/dlinteger136
plateletPlateletsper cm^3/1000integer110
drug3integer 0
statusDeath or Liver Transplantationinteger 0
edema3integer 0
copperUrine Copperug/dayinteger108
fu.yrsFollow-up Timeyeardouble 0

VariableLevels
sexmale
female
spiders, hepatomabsent
 ascitespresent
drugD-penicillamine
placebo
not randomized
edemano edema
edema, no diuretic therapy
edema despite diuretic therapy

# did have results='asis' above
d <- describe(pbc)
html(d, size=80, scroll=TRUE)
pbc

19 Variables   418 Observations

bili: Serum Bilirubin mg/dl
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
4180980.9983.2213.742 0.50 0.60 0.80 1.40 3.40 8.0314.00
lowest : 0.3 0.4 0.5 0.6 0.7 , highest: 21.6 22.5 24.5 25.5 28.0
albumin: Albumin gm/dl
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
418015413.4970.4732.7502.9673.2433.5303.7704.0104.141
lowest : 1.96 2.10 2.23 2.27 2.31 , highest: 4.30 4.38 4.40 4.52 4.64
stage: Histologic Stage, Ludwig Criteria
image
nmissingdistinctInfoMeanGmd
412640.8933.0240.9519
 Value          1     2     3     4
 Frequency     21    92   155   144
 Proportion 0.051 0.223 0.376 0.350
 

protime: Prothrombin Time sec.
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
4162480.99810.731.029 9.60 9.8010.0010.6011.1012.0012.45
lowest : 9.0 9.1 9.2 9.3 9.4 , highest: 13.8 14.1 15.2 17.1 18.0
sex
nmissingdistinct
41802
 Value        male female
 Frequency      44    374
 Proportion  0.105  0.895
 

age: Age
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
4180345150.7411.9633.8436.3742.8351.0058.2464.3067.92
lowest : 26.27789 28.88433 29.55510 30.27515 30.57358 , highest: 74.52430 75.00000 75.01164 76.70910 78.43943
spiders
nmissingdistinct
3121062
 Value       absent present
 Frequency      222      90
 Proportion   0.712   0.288
 

hepatom
nmissingdistinct
3121062
 Value       absent present
 Frequency      152     160
 Proportion   0.487   0.513
 

ascites
nmissingdistinct
3121062
 Value       absent present
 Frequency      288      24
 Proportion   0.923   0.077
 

alk.phos: Alkaline Phosphatase U/liter
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
312106295119831760 599.6 663.0 871.51259.01980.03826.46669.9
lowest : 289.0 310.0 369.0 377.0 414.0 , highest: 11046.6 11320.2 11552.0 12258.8 13862.4
sgot: SGOT U/ml
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
3121061791122.660.45 54.25 60.45 80.60114.70151.90196.47219.25
lowest : 26.35 28.38 41.85 43.40 45.00 , highest: 288.00 299.15 328.60 338.00 457.25
chol: Cholesterol mg/dl
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
2841342011369.5194.5188.4213.6249.5309.5400.0560.8674.0
lowest : 120 127 132 149 151 , highest: 1336 1480 1600 1712 1775
trig: Triglycerides mg/dl
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
2821361461124.764.07 56.00 63.10 84.25108.00151.00195.00230.95
lowest : 33 44 46 49 50 , highest: 319 322 382 432 598
platelet: Platelets per cm3/1000
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
3081102101261.9107.8117.7139.7199.8257.0322.5386.5430.6
lowest : 62 70 71 79 80 , highest: 493 514 518 539 563
drug
image
nmissingdistinct
41803
 Value      D-penicillamine         placebo  not randomized
 Frequency              154             158             106
 Proportion           0.368           0.378           0.254
 

status: Death or Liver Transplantation
nmissingdistinctInfoSumMeanGmd
418020.711610.38520.4748

edema
image
nmissingdistinct
41803
 Value                            no edema     edema, no diuretic therapy
 Frequency                             354                             44
 Proportion                          0.847                          0.105
                                          
 Value      edema despite diuretic therapy
 Frequency                              20
 Proportion                          0.048
 

copper: Urine Copper ug/day
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
310108158197.6583.16 17.45 24.00 41.25 73.00123.00208.10249.20
lowest : 4 9 10 11 12 , highest: 412 444 464 558 588
fu.yrs: Follow-up Time year
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
418039915.2513.429 0.671 1.661 2.992 4.736 7.155 9.64911.063
lowest : 0.1122519 0.1177276 0.1396304 0.1943874 0.2108145
highest:12.320328512.344969212.383299112.473648213.1279945

# prList is in Hmisc; useful for plotting or printing a list of objects
# Can just use plot(d) if don't care about the mess
# If using html output these 2 images would not be rendered no matter what
p <- plot(d)
# The option htmlfig=2 causes markupSpecs$html$cap() to be used to
# HTML-typeset as a figure caption and to put the sub-sub section
# marker ### in front of the caption.  htmlfig is the only reason
# results='asis' was needed in the chunk header
# We define a long caption for one of the plots, which does not appear
# in the table of contents
# prList works for html notebooks but not html documents
# prList(p, lcap=c('', 'These are spike histograms'), htmlfig=2)
htmltools::tagList(p)    # lapply(p, plotly::as.widget)
s <- summaryM(bili + albumin + stage + protime + sex + age + spiders +
              alk.phos + sgot + chol ~ drug, data=pbc,
                            overall=FALSE, test=TRUE)
plot(s, which='categorical')

Figure 1.1: Proportions and χ2 tests for categorical variables

plot(s, which='continuous', vars=1 : 4)

Figure 1.2: Extended box plots for the first 4 continuous variables

plot(s, which='continuous', vars=5 : 7)

Figure 1.3: Extended box plots for the remaining continuous variables

html(s, caption='Baseline characteristics by randomized treatment',
     exclude1=TRUE, npct='both', digits=3, middle.bold=TRUE,
     prmsd=TRUE, brmsd=TRUE, msdsize=mu$smaller2)

Baseline characteristics by randomized treatment.
N
D-penicillamine
N=154
placebo
N=158
not randomized
N=106
Test Statistic
Serum Bilirubin
mg/dl
418 0.725 1.300 3.600
3.649 ± 5.282
0.800 1.400 3.200
2.873 ± 3.629
0.725 1.400 3.075
3.117 ± 4.043
F2 415=0.03, P=0.9721
Albumin
gm/dl
418 3.342 3.545 3.777
3.524 ± 0.396
3.212 3.565 3.830
3.516 ± 0.443
3.125 3.470 3.720
3.431 ± 0.435
F2 415=2.13, P=0.121
Histologic Stage, Ludwig Criteria : 1 412 0.03 4154 0.08 12158 0.05 5100 χ26=5.33, P=0.5022
  2 0.21 32154 0.22 35158 0.25 25100
  3 0.42 64154 0.35 56158 0.35 35100
  4 0.35 54154 0.35 55158 0.35 35100
Prothrombin Time
sec.
416 10.000 10.600 11.400
10.800 ±  1.138
10.025 10.600 11.000
10.653 ±  0.851
10.100 10.600 11.000
10.750 ±  1.078
F2 413=0.23, P=0.7951
sex : female 418 0.90 139154 0.87 137158 0.92 98106 χ22=2.38, P=0.3042
Age 418 41.43 48.11 55.80
48.58 ±  9.96
42.98 51.93 58.90
51.42 ± 11.01
46.00 53.00 61.00
52.87 ±  9.78
F2 415=6.1, P=0.0021
spiders 312 0.29 45154 0.28 45158 χ21=0.02, P=0.8852
Alkaline Phosphatase
U/liter
312 922 1283 1950
1943 ± 2102
841 1214 2028
2021 ± 2183
F1 310=0.06, P=0.8123
SGOT
U/ml
312 83.8 117.4 151.9
125.0 ±  58.9
76.7 111.6 151.5
120.2 ±  54.5
F1 310=0.55, P=0.463
Cholesterol
mg/dl
284 254 304 377
374 ± 252
248 316 417
365 ± 210
F1 282=0.37, P=0.5453
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.
Tests used: 1Kruskal-Wallis test; 2Pearson test; 3Wilcoxon test .
# Statistical analysis methods

2 Results

2.1 Using the rms library for regression models

require(rms)
getHdata(lead)
# Subset variables just so contents() and describe() output is short
# Override units of measurement to make them legal R expressions
lead <- upData(lead,
               keep=c('ld72', 'ld73', 'age', 'maxfwt'),
               labels=c(age='Age'),
               units=c(age='years', ld72='mg/100*ml', ld73='mg/100*ml'))
Input object size:   53928 bytes;    39 variables    124 observations
Kept variables  ld72,ld73,age,maxfwt
New object size:    14096 bytes;    4 variables 124 observations

To use Predict, summary, or nomogram in the rms package, you need to let rms first compute summaries of the distributional characteristics of the predictors

dd <- datadist(lead); options(datadist='dd')
dd    # show what datadist computed
                      age  ld72  ld73 maxfwt
Low:effect       6.166667 27.00 24.00   46.0
Adjust to        8.375000 34.00 30.50   52.0
High:effect     12.020833 43.00 37.00   59.0
Low:prediction   3.929167 18.00 18.15   33.2
High:prediction 15.000000 61.85 50.85   72.2
Low              3.750000  1.00 15.00   13.0
High            15.916667 99.00 58.00   84.0

Fit an ordinary least squares regrssion model

# Fit an ordinary linear regression model with 3 predictors assumed linear
f <- ols(maxfwt ~ age + ld72 + ld73, data=lead)

f$coef
 Intercept        age       ld72       ld73 
34.1058551  2.6078450 -0.0245978 -0.2389695 

Unofrtunately, as far as I can tell there is no nice way to display the regression table in HTML. This oucld be done manually

beta = coef(olsfit). se = sqrt(diag(vcov(olsfit))) p = 1 - pchisq(Z^2, 1)

To get t-statistic pvalues need to know the df … Z <- beta/se p = 2 * (1 - pt(abs(Z), errordf))

Another approach is to use print.capture see https://stackoverflow.com/questions/47724189/extract-all-model-statistics-from-rms-fits

#parser
get_model_stats = function(x) {
  cap = capture.output(print(x))

  #model stats
  stats = c()
  stats$R2.adj = str_match(cap, "R2 adj\\s+ (\\d\\.\\d+)") %>% na.omit() %>% .[, 2] %>% as.numeric()

  #coef stats lines
  coef_lines = cap[which(str_detect(cap, "Coef\\s+S\\.E\\.")):(length(cap) - 1)]

  #parse
  coef_lines_table = suppressWarnings(readr::read_table(coef_lines %>% stringr::str_c(collapse = "\n")))
  colnames(coef_lines_table)[1] = "Predictor"

  list(
    stats = stats,
    coefs = coef_lines_table
  )
}


get_model_stats(f)
$stats
$stats$R2.adj
[1] 0.45


$coefs
# A tibble: 4 x 5
  Predictor    Coef   S.E.     t `Pr(>|t|)`
  <chr>       <dbl>  <dbl> <dbl> <chr>     
1 Intercept 34.1    4.84    7.04 <0.0001   
2 age        2.61   0.323   8.07 <0.0001   
3 ld72      -0.0246 0.0782 -0.31 0.7538    
4 ld73      -0.239  0.132  -1.8  0.0744    

Residual plots:

r <- resid(f)
par(mfrow=c(2,2))   # 2x2 matrix of plots
plot(fitted(f), r); abline(h=0)  # yhat vs. r
with(lead, plot(age,  r));    abline(h=0)
with(lead, plot(ld73, r));    abline(h=0)
qqnorm(r)           # linearity indicates normality
qqline(as.numeric(r))

3 Discussion

4 Appendix

R version 4.0.3 (2020-10-10)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.utf8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.utf8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.utf8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages:

  • stats
  • graphics
  • grDevices
  • utils
  • datasets
  • methods
  • base

other attached packages:

  • rms(v.6.0-1)
  • SparseM(v.1.78)
  • Hmisc(v.4.4-1)
  • Formula(v.1.2-3)
  • survival(v.3.2-7)
  • lattice(v.0.20-41)
  • kableExtra(v.1.2.1)
  • here(v.0.1)
  • forcats(v.0.5.0)
  • stringr(v.1.4.0)
  • dplyr(v.1.0.2)
  • purrr(v.0.3.4)
  • readr(v.1.3.1)
  • tidyr(v.1.1.2)
  • tibble(v.3.0.3)
  • ggplot2(v.3.3.2)
  • tidyverse(v.1.3.0)
  • nvimcom(v.0.9-102)

loaded via a namespace (and not attached):

  • TH.data(v.1.0-10)
  • colorspace(v.1.4-1)
  • ellipsis(v.0.3.1)
  • rprojroot(v.1.3-2)
  • htmlTable(v.2.0.1)
  • base64enc(v.0.1-3)
  • fs(v.1.5.0)
  • rstudioapi(v.0.11)
  • farver(v.2.0.3)
  • MatrixModels(v.0.4-1)
  • fansi(v.0.4.1)
  • mvtnorm(v.1.1-1)
  • lubridate(v.1.7.9)
  • xml2(v.1.3.2)
  • codetools(v.0.2-16)
  • splines(v.4.0.3)
  • knitr(v.1.29)
  • jsonlite(v.1.7.1)
  • broom(v.0.7.0)
  • cluster(v.2.1.0)
  • dbplyr(v.1.4.4)
  • png(v.0.1-7)
  • compiler(v.4.0.3)
  • httr(v.1.4.2)
  • backports(v.1.1.9)
  • assertthat(v.0.2.1)
  • Matrix(v.1.2-18)
  • lazyeval(v.0.2.2)
  • cli(v.2.2.0)
  • htmltools(v.0.5.0)
  • quantreg(v.5.67)
  • tools(v.4.0.3)
  • gtable(v.0.3.0)
  • glue(v.1.4.2)
  • Rcpp(v.1.0.5)
  • cellranger(v.1.1.0)
  • vctrs(v.0.3.4)
  • nlme(v.3.1-149)
  • conquer(v.1.0.2)
  • crosstalk(v.1.1.0.1)
  • xfun(v.0.17)
  • ps(v.1.3.4)
  • rvest(v.0.3.6)
  • lifecycle(v.0.2.0)
  • polspline(v.1.1.19)
  • MASS(v.7.3-53)
  • zoo(v.1.8-8)
  • scales(v.1.1.1)
  • hms(v.0.5.3)
  • sandwich(v.3.0-0)
  • RColorBrewer(v.1.1-2)
  • yaml(v.2.2.1)
  • gridExtra(v.2.3)
  • pander(v.0.6.3)
  • rpart(v.4.1-15)
  • latticeExtra(v.0.6-29)
  • stringi(v.1.5.3)
  • checkmate(v.2.0.0)
  • rlang(v.0.4.7)
  • pkgconfig(v.2.0.3)
  • matrixStats(v.0.56.0)
  • evaluate(v.0.14)
  • htmlwidgets(v.1.5.1)
  • labeling(v.0.3)
  • tidyselect(v.1.1.0)
  • magrittr(v.1.5)
  • bookdown(v.0.21)
  • R6(v.2.4.1)
  • generics(v.0.0.2)
  • multcomp(v.1.4-15)
  • DBI(v.1.1.0)
  • pillar(v.1.4.6)
  • haven(v.2.3.1)
  • foreign(v.0.8-80)
  • withr(v.2.2.0)
  • nnet(v.7.3-14)
  • modelr(v.0.1.8)
  • crayon(v.1.3.4)
  • utf8(v.1.1.4)
  • plotly(v.4.9.2.1)
  • rmarkdown(v.2.5)
  • viridis(v.0.5.1)
  • jpeg(v.0.1-8.1)
  • grid(v.4.0.3)
  • readxl(v.1.3.1)
  • data.table(v.1.13.0)
  • blob(v.1.2.1)
  • reprex(v.0.3.0)
  • digest(v.0.6.25)
  • webshot(v.0.5.2)
  • munsell(v.0.5.0)
  • viridisLite(v.0.3.0)